import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
data = pd.read_csv(r'/home/apenwell/DataFun/torchlight_practice_data.csv') #change this line to ur file path
print(data)
url product_facet metadata_facet contenttype_facet page_title 0 http1 openshift installation documentation Title 1 1 http1 openshift installation documentation Title 1 2 http2 rhel migration solution Title 2 3 http2 rhel migration solution Title 2 4 http2 rhel migration solution Title 2 5 http3 sateliite development documentation Title 3 6 http4 cloud administration security Title 4 7 http5 openstack migration article Title 5 8 http6 jboss troubleshooting video Title 6 9 http7 ansible troubleshooting documentation Title 7 10 http7 ansible troubleshooting documentation Title 7 11 http7 ansible troubleshooting documentation Title 7 12 http7 ansible troubleshooting documentation Title 7 13 http8 data grid administration solution Title 8 14 http9 gluster installation security Title 9 15 http10 ceph troubleshooting solution Title 10
#finding all unique values in the metadata_facet column and outputting a count of each value
metadata_counts = data['metadata_facet'].value_counts()
print(metadata_counts)
cnum = np.arange(len(metadata_counts))
troubleshooting 6 migration 4 installation 3 administration 2 development 1 Name: metadata_facet, dtype: int64
#putting the above output into a bar graph
plt.figure()
plt.barh(cnum,metadata_counts, color = ('darkred','firebrick', 'red', 'tomato', 'lightsalmon'))
#ticks_loc = plt.xticks(np.arange(0, 1, step=0.2)) # Set label locations.
ticks = plt.yticks(np.arange(5), ['troubleshooting', 'migration', 'installation', 'administration',
'development']) # Set text labels and properties
plt.ylabel('Metadata Facet')
plt.xlabel('Number of resources')
plt.title('Resources by Metadata Facet')
#plt.savefig('metadata_bar_chart.png', transparent = False)
Text(0.5, 1.0, 'Resources by Metadata Facet')
#putting the same data into a pie chart
plt.figure()
mylabels = ['troubleshooting', 'migration', 'installation', 'administration','development']
r = plt.pie(metadata_counts, labels = mylabels, autopct='%.0f%%')
plt.title('Resources by Metadata Facet')
Text(0.5, 1.0, 'Resources by Metadata Facet')
#working on gauges of percentages
fig = go.Figure(go.Indicator(
domain = {'x': [0, 1], 'y': [0, 1]},
value = 80,
mode = "gauge+number+delta",
number = {'suffix': "%"},
title = {'text': "Docs Impressions vs Docs Clicked"},
delta = {'reference': 60}, #the number here could represent the percentages from the previous year
gauge = {'axis': {'range': [None, 100]},
'bar': {'color': "red"},
'steps' : [
{'range': [0, 100], 'color': "lightgray"}],
'threshold' : {'line': {'color': "green", 'width': 4}, 'thickness': 0.75, 'value': 90}}))
fig.show()
#the gauge would go from 0-100 to represent a percentage
#the red progress bar would show the percentage from the current month
#the increase/decrease below the percentage number is calculating the difference between the reference value,
#delta and the current value, value
#the green line would mark the "goal" for that given percentage (ex below: here we want a 90% CTR)
#working on gauges of percentages - option 2
fig2 = go.Figure(go.Indicator(
domain = {'x': [0, 1], 'y': [0, 1]},
value = 80,
mode = "gauge+number+delta",
number = {'suffix': "%"},
title = {'text': "Docs Impressions vs Docs Clicked"},
delta = {'reference': 60}, #the number here could represent the percentages from the previous year
gauge = {'axis': {'range': [None, 100]},
'bar': {'color': "black"},
'steps' : [
{'range': [0, 50], 'color': "red"},
{'range': [50, 85], 'color': "yellow"},
{'range': [85, 100], 'color': "green"}]}))
fig2.show()
#grouping data tests
product_counts = data['product_facet'].value_counts()
grouped = data.groupby('product_facet')
for key, item in grouped:
print(grouped.get_group(key), "\n\n")
url product_facet metadata_facet contenttype_facet page_title
9 http7 ansible troubleshooting documentation Title 7
10 http7 ansible troubleshooting documentation Title 7
11 http7 ansible troubleshooting documentation Title 7
12 http7 ansible troubleshooting documentation Title 7
url product_facet metadata_facet contenttype_facet page_title
15 http10 ceph troubleshooting solution Title 10
url product_facet metadata_facet contenttype_facet page_title
6 http4 cloud administration security Title 4
url product_facet metadata_facet contenttype_facet page_title
13 http8 data grid administration solution Title 8
url product_facet metadata_facet contenttype_facet page_title
14 http9 gluster installation security Title 9
url product_facet metadata_facet contenttype_facet page_title
8 http6 jboss troubleshooting video Title 6
url product_facet metadata_facet contenttype_facet page_title
0 http1 openshift installation documentation Title 1
1 http1 openshift installation documentation Title 1
url product_facet metadata_facet contenttype_facet page_title
7 http5 openstack migration article Title 5
url product_facet metadata_facet contenttype_facet page_title
2 http2 rhel migration solution Title 2
3 http2 rhel migration solution Title 2
4 http2 rhel migration solution Title 2
url product_facet metadata_facet contenttype_facet page_title
5 http3 sateliite development documentation Title 3
#connecting the database to python. this would go above all the visualization code.
cursor = connection.cursor() #create a cursor so that you can interact with the database
from getpass import getpass
from mysql.connector import connect, Error
try:
with connect(
host="localhost",
user=input("Enter username: "),
password=getpass("Enter password: "),
database="torchlight",
) as connection:
print(connection)
except Error as e:
print(e)
#selecting all rows for a certain product (rhel in this case). this would go above all the visualization code
select_rhel = """
SELECT *
FROM WebBehavior
WHERE product_facet = 'rhel'
ORDER BY findability DESC
"""
with connection.cursor() as cursor:
cursor.execute(select_rhel)
for row in cursor.fetchall():
print(row)
#pie chart for contenttypes example - not actually going to use in this exact way
contenttype = mycursor.execute("Select contenttype_facet from WebBehavior")
result = mycursor.fetchall
contenttype = []
#Marks = []
for i in mycursor:
contenttype.append(i[0])
#Marks.append(i[1])
#plt.bar(Names, Marks)
#plt.ylim(0, 5)
#plt.xlabel("Name of Students")
#plt.ylabel("Marks of Students")
#plt.title("Student's Information")
#plt.show()
plt.pie(contenttype)
mylabels = ['troubleshooting', 'migration', 'installation', 'administration','development'] #fix this line
r = plt.pie(metadata_counts, labels = mylabels, autopct='%.0f%%') #fix this line
plt.title('Resources by Content Type')
plt.show()
#page 1 - Home Page - usable code (hopefully)
#Main Table
select count(*) as "# of Docs" from WebBehavior;
select count(*) as "# of Docs with 200 Status" from WebBehavior where Status = '200';
#Main Visual and Content Type Visual (these are going to be the same...right?)
contenttype_counts = mycursor.execute("select ContentTypeFacet, count(*) from WebBehavior Group By ContentTypeFacet") #gets counts by content type
mylabels = mycursor.execute("select distinct ContentTypeFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall
contenttype_counts = []
#Marks = []
for i in mycursor:
contenttype_counts.append(i[0])
plt.pie()
r = plt.pie(contenttype_counts, labels = mylabels, autopct='%.0f%%') #fix this line
plt.title('All Resources by Content Type')
plt.show()
#Content Type Table (im "ranking" these by total clicks not total queries because I think that makes more sense... is that ok?)
select ContentTypeFacet, sum(TotalClicks) from WebBehavior Group By ContentTypeFacet Order By sum(TotalClicks) DESC;
#Products Visual
product_counts = mycursor.execute("select ProductFacet, count(*) from WebBehavior Group By ProductFacet")
mylabels2 = mycursor.execute("select distinct ProductFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall
product_counts = []
for i in mycursor:
product_counts.append(i[0])
plt.pie()
r = plt.pie(product_counts, labels = mylabels2, autopct='%.0f%%') #fix this line
plt.title('All Resources by Product')
plt.show()
#Product Table
select ProductFacet, sum(TotalClicks) from WebBehavior Group By ProductFacet Order By sum(TotalClicks) DESC;
#Metadata Visual
metadata_counts = mycursor.execute("select MetadataFacet, count(*) from WebBehavior Group By MetadataFacet")
mylabels3 = mycursor.execute("select distinct MetadataFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall
metadata_counts = []
for i in mycursor:
metadata_counts.append(i[0])
plt.pie()
r = plt.pie(metadata_counts, labels = mylabels3, autopct='%.0f%%') #fix this line
plt.title('All Resources by Metadata')
plt.show()
#Metadata Table
select MetadataFacet, sum(TotalClicks) from WebBehavior Group By MetadataFacet Order By sum(TotalClicks) DESC;
#page 2 - Product Page
#Main Table - rhel
#selecting number of docs from a certain product
select count(*) as "# of Docs" from WebBehavior where ProductFacet = 'rhel';
#selecting number of docs with 200 code
select count(*) as "# of Docs with 200 Status" from WebBehavior where ProductFacet = 'rhel' and Status = '200';
#selecting number of docs with 404 code
select count(*) as "# of Docs with 404 Status" from WebBehavior where ProductFacet = 'rhel' and Status = '404';
#Content CTR - rhel
select avg(Total_CTR) from WebBehavior where ProductFacet = 'rhel';
#Content from site search - rhel
#Content from search engines - rhel
#Queries % of search - rhel
#Findability score - rhel
select avg(FindabilityScore) from WebBehavior where ProductFacet = 'rhel';
#Content Type Visual - rhel
rhel_counts = mycursor.execute("select ContentTypeFacet, count(*) from WebBehavior where ProductFacet = 'rhel' Group By ContentTypeFacet")
rhel_labels = mycursor.execute("select distinct ContentTypeFacet from WebBehavior where ProductFacet = 'rhel'") #don't know if this will work lol
result = mycursor.fetchall
rhel_counts = []
for i in mycursor:
rhel_counts.append(i[0])
plt.pie()
r = plt.pie(rhel_counts, labels = rhel_labels, autopct='%.0f%%') #fix this line
plt.title('RHEL Resources by Content Type')
plt.show()
#Metadata Visual - rhel
rhel_m_counts = mycursor.execute("select MetadataFacet, count(*) from WebBehavior where ProductFacet = 'rhel' Group By MetadataFacet")
rhel_m_labels = mycursor.execute("select distinct MetadataFacet from WebBehavior where ProductFacet = 'rhel'") #don't know if this will work lol
result = mycursor.fetchall
rhel_m_counts = []
for i in mycursor:
rhel_m_counts.append(i[0])
plt.pie()
r = plt.pie(rhel_m_counts, labels = rhel_m_labels, autopct='%.0f%%') #fix this line
plt.title('RHEL Resources by Metadata Type')
plt.show()
#Top Docs - rhel
select WebURL, FindabilityScore from WebBehavior where ProductFacet = 'rhel' order by FindabilityScore DESC
LIMIT 5;
#Top Queries - rhel
select WebURL, Keyword, TotalQueries from WebBehavior where ProductFacet = 'rhel' order by TotalQueries DESC
LIMIT 5;
#Lowest Docs - rhel
select WebURL, FindabilityScore from WebBehavior where ProductFacet = 'rhel' order by FindabilityScore
LIMIT 5;